getwd() # Current working directory
setwd("C:/Users/EarthShape/Desktop/1-Projects/Earthshape/GEO77_R_course") # Set a working directory
setwd("C:\\Users\\EarthShape\\Desktop\\1-Projects\\Earthshape\\GEO77_R_course") # another valid way to select the same working directory
setwd("C:\Users\EarthShape\Desktop\1-Projects\Earthshape\GEO77_R_course") # an invalid way of selecting the same working directory
or go to Session/Set Working Directory/Choose directory… and select folder
or press Ctrl+Shift+H and select folder
Why?
R has the ability to load multiple file formats, which can be further extended through packages.
Reading a dataset read.table()
soil_data <- read.table("./data/soil_data.txt", header = TRUE)
names(soil_data)
## [1] "ID" "X_coord" "Y_coord" "Id"
## [5] "pH" "P" "K" "Mg"
## [9] "horiz" "upper_depth" "lower_depth" "stones"
## [13] "coarse_sand" "medium_sand" "fine_sand" "coarse_silt"
## [17] "medium_silt" "fine_silt" "clay" "soil_type_ka4"
## [21] "pH_cacl2"
str(soil_data)
## 'data.frame': 57 obs. of 21 variables:
## $ ID : int 0 1 2 3 4 5 6 7 8 9 ...
## $ X_coord : int 4428062 4427932 4428028 4428092 4428089 4428105 4428122 4428129 4428086 4427954 ...
## $ Y_coord : int 5749930 5749890 5749724 5749577 5749360 5749189 5749044 5748889 5748785 5748947 ...
## $ Id : int 110 111 112 113 114 115 116 117 118 119 ...
## $ pH : num 6.3 7 6.1 6.6 6 5.5 5.5 5.2 5.2 5.3 ...
## $ P : num 25.4 88 11.7 7.8 4.6 4.4 6.4 5.5 12.6 9.5 ...
## $ K : num 16 23 22 18 16 19 15 13 17 17 ...
## $ Mg : num 7.3 7.4 8.2 7.8 7.6 6.9 6 5.9 5.8 6.7 ...
## $ horiz : chr "Ap" "Ap" "Ap" "Ap" ...
## $ upper_depth : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lower_depth : int 3 3 3 3 3 3 3 3 3 3 ...
## $ stones : num 1.1 3.4 1.5 0.04 0.2 0.4 1.4 1.3 26.2 24.7 ...
## $ coarse_sand : num 1.4 3.2 2.1 0.8 0.6 0.8 2 2.2 8.3 8.9 ...
## $ medium_sand : num 5.8 14.7 6.5 1.1 1.1 1.5 2.9 2.9 8.7 9.7 ...
## $ fine_sand : num 27.6 6.9 3.9 1.7 1.7 2.8 3.7 3.9 8.9 9.3 ...
## $ coarse_silt : num 39.5 42.1 47.1 53.3 52.3 52.8 51.5 52.4 43.9 42.9 ...
## $ medium_silt : num 11 10.6 13.7 13.7 16.1 15.2 14.8 14.5 10 12 ...
## $ fine_silt : num 2.9 5.9 6.2 6.5 7.6 6.6 6.6 7.2 6.2 5.6 ...
## $ clay : num 11.8 16.6 20.5 22.9 20.6 20.3 18.5 16.9 14 11.6 ...
## $ soil_type_ka4: chr "Uls" "Uls" "Ut4" "Ut4" ...
## $ pH_cacl2 : num 7.33 7.26 5.87 6.79 5.77 6.3 6.46 5.79 5.21 7.25 ...
| ID | X_coord | Y_coord | Id | pH | P | K | Mg | horiz | upper_depth | lower_depth | stones | coarse_sand | medium_sand | fine_sand | coarse_silt | medium_silt | fine_silt | clay | soil_type_ka4 | pH_cacl2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4428062 | 5749930 | 110 | 6.3 | 25.4 | 16 | 7.3 | Ap | 0 | 3 | 1.10 | 1.4 | 5.8 | 27.6 | 39.5 | 11.0 | 2.9 | 11.8 | Uls | 7.33 |
| 1 | 4427932 | 5749890 | 111 | 7.0 | 88.0 | 23 | 7.4 | Ap | 0 | 3 | 3.40 | 3.2 | 14.7 | 6.9 | 42.1 | 10.6 | 5.9 | 16.6 | Uls | 7.26 |
| 2 | 4428028 | 5749724 | 112 | 6.1 | 11.7 | 22 | 8.2 | Ap | 0 | 3 | 1.50 | 2.1 | 6.5 | 3.9 | 47.1 | 13.7 | 6.2 | 20.5 | Ut4 | 5.87 |
| 3 | 4428092 | 5749577 | 113 | 6.6 | 7.8 | 18 | 7.8 | Ap | 0 | 3 | 0.04 | 0.8 | 1.1 | 1.7 | 53.3 | 13.7 | 6.5 | 22.9 | Ut4 | 6.79 |
| 4 | 4428089 | 5749360 | 114 | 6.0 | 4.6 | 16 | 7.6 | Ap | 0 | 3 | 0.20 | 0.6 | 1.1 | 1.7 | 52.3 | 16.1 | 7.6 | 20.6 | Ut4 | 5.77 |
| 5 | 4428105 | 5749189 | 115 | 5.5 | 4.4 | 19 | 6.9 | Ap | 0 | 3 | 0.40 | 0.8 | 1.5 | 2.8 | 52.8 | 15.2 | 6.6 | 20.3 | Ut4 | 6.30 |
| 6 | 4428122 | 5749044 | 116 | 5.5 | 6.4 | 15 | 6.0 | Ap | 0 | 3 | 1.40 | 2.0 | 2.9 | 3.7 | 51.5 | 14.8 | 6.6 | 18.5 | Ut4 | 6.46 |
| 7 | 4428129 | 5748889 | 117 | 5.2 | 5.5 | 13 | 5.9 | Ap | 0 | 3 | 1.30 | 2.2 | 2.9 | 3.9 | 52.4 | 14.5 | 7.2 | 16.9 | Ut3 | 5.79 |
| 8 | 4428086 | 5748785 | 118 | 5.2 | 12.6 | 17 | 5.8 | Ap | 0 | 3 | 26.20 | 8.3 | 8.7 | 8.9 | 43.9 | 10.0 | 6.2 | 14.0 | Uls | 5.21 |
| 9 | 4427954 | 5748947 | 119 | 5.3 | 9.5 | 17 | 6.7 | Ap | 0 | 3 | 24.70 | 8.9 | 9.7 | 9.3 | 42.9 | 12.0 | 5.6 | 11.6 | Uls | 7.25 |
| 10 | 4427887 | 5749114 | 120 | 6.5 | 18.9 | 22 | 7.0 | Ap | 0 | 3 | 10.50 | 8.4 | 8.0 | 8.7 | 40.3 | 11.5 | 6.1 | 17.0 | Lu | 6.27 |
| 11 | 4427836 | 5749319 | 121 | 6.1 | 7.2 | 28 | 9.1 | Ap | 0 | 3 | 1.20 | 2.6 | 2.5 | 6.4 | 48.0 | 15.2 | 4.5 | 20.8 | Ut4 | 6.81 |
| 12 | 4427950 | 5749243 | 122 | 5.5 | 7.1 | 21 | 7.0 | Ap | 0 | 3 | 33.90 | 7.3 | 8.8 | 7.3 | 44.3 | 12.5 | 4.3 | 15.5 | Uls | 5.86 |
| 13 | 4427929 | 5749496 | 123 | 6.0 | 5.1 | 19 | 7.6 | Ap | 0 | 3 | 0.20 | 0.7 | 1.5 | 2.6 | 52.0 | 15.7 | 5.8 | 21.7 | Ut4 | 6.04 |
| 14 | 4427807 | 5749625 | 124 | 6.4 | 7.9 | 22 | 9.5 | Ap | 0 | 3 | 0.60 | 1.1 | 2.2 | 4.3 | 50.6 | 13.7 | 6.0 | 22.1 | Ut4 | 6.29 |
| 15 | 4427786 | 5749495 | 125 | 6.3 | 9.5 | 34 | 7.6 | Ap | 0 | 3 | 1.00 | 2.2 | 2.6 | 4.5 | 51.5 | 13.4 | 6.0 | 19.8 | Ut4 | 5.88 |
| 16 | 4427579 | 5749612 | 126 | 6.9 | 21.1 | 25 | 5.1 | Ap | 0 | 3 | 8.70 | 2.4 | 3.0 | 7.3 | 44.6 | 12.5 | 7.6 | 22.6 | Lu | 7.05 |
| 17 | 4427668 | 5749765 | 127 | 6.7 | 17.0 | 24 | 6.6 | Ap | 0 | 3 | 12.70 | 5.7 | 7.3 | 7.7 | 42.9 | 10.3 | 4.0 | 22.1 | Lu | 7.10 |
| 18 | 4427780 | 5749847 | 128 | 6.2 | 31.3 | 31 | 9.7 | Ap | 0 | 3 | 0.50 | 1.3 | 3.2 | 5.7 | 47.8 | 11.7 | 6.4 | 23.9 | Ut4 | 5.49 |
| 19 | 4427563 | 5749885 | 129 | 7.1 | 23.6 | 29 | 5.9 | Ap | 0 | 3 | 3.30 | 2.9 | 3.8 | 5.0 | 47.6 | 11.6 | 5.1 | 24.0 | Lu | 7.36 |
| 20 | 4427461 | 5749819 | 130 | 7.0 | 24.3 | 32 | 7.3 | Ap | 0 | 3 | 0.90 | 3.2 | 3.1 | 13.3 | 42.5 | 16.2 | 5.0 | 16.7 | Uls | 7.02 |
| 21 | 4427347 | 5749787 | 131 | 6.8 | 20.1 | 27 | 6.8 | Ap | 0 | 3 | 1.70 | 10.2 | 5.5 | 31.9 | 27.8 | 7.5 | 4.2 | 12.9 | Sl4 | 6.91 |
| 22 | 4427303 | 5749690 | 132 | 7.2 | 25.0 | 21 | 3.0 | Ap | 0 | 3 | 2.60 | 28.8 | 7.5 | 23.1 | 19.1 | 5.8 | 3.8 | 11.9 | Sl3 | 7.29 |
| 23 | 4427560 | 5749481 | 133 | 7.0 | 20.5 | 18 | 4.1 | Ap | 0 | 3 | 6.60 | 21.2 | 6.9 | 40.2 | 14.8 | 2.9 | 3.1 | 10.9 | Sl3 | 6.78 |
| 24 | 4427704 | 5749249 | 134 | 6.7 | 11.1 | 20 | 7.0 | Ap | 0 | 3 | 32.80 | 22.1 | 12.2 | 16.8 | 23.3 | 8.7 | 6.9 | 10.0 | Sl3 | 6.35 |
| 25 | 4427785 | 5749011 | 135 | 7.1 | 7.6 | 17 | 8.1 | Ap | 0 | 3 | 3.20 | 0.3 | 0.7 | 4.6 | 57.4 | 14.8 | 6.6 | 15.6 | Ut3 | 7.49 |
| 26 | 4427618 | 5749083 | 136 | 6.2 | 5.9 | 22 | 8.7 | Ap | 0 | 3 | 0.60 | 0.8 | 1.7 | 6.8 | 51.5 | 15.9 | 7.2 | 16.1 | Ut3 | 6.22 |
| 27 | 4427660 | 5748900 | 137 | 6.5 | 6.1 | 19 | 10.4 | Ap | 0 | 3 | 0.30 | 0.5 | 0.8 | 5.2 | 52.5 | 14.6 | 7.5 | 18.9 | Ut4 | 6.80 |
| 28 | 4427849 | 5748858 | 138 | 7.2 | 8.5 | 16 | 7.5 | Ap | 0 | 3 | 0.20 | 0.4 | 1.0 | 6.7 | 46.1 | 17.5 | 8.6 | 19.7 | Ut4 | 7.40 |
| 29 | 4427440 | 5748925 | 139 | 5.7 | 3.4 | 16 | 8.8 | Ap | 0 | 3 | 0.20 | 0.3 | 0.8 | 5.1 | 48.1 | 17.6 | 6.1 | 22.0 | Ut4 | 5.16 |
| 30 | 4427476 | 5749111 | 140 | 6.1 | 3.6 | 14 | 8.2 | Ap | 0 | 3 | 0.20 | 0.5 | 0.8 | 4.7 | 53.1 | 13.0 | 6.6 | 21.3 | Ut4 | 6.56 |
| 31 | 4427512 | 5749330 | 141 | 6.6 | 7.0 | 20 | 7.9 | Ap | 0 | 3 | 0.40 | 0.4 | 0.7 | 4.3 | 58.7 | 15.1 | 4.2 | 16.6 | Ut3 | 7.46 |
| 32 | 4427357 | 5749507 | 142 | 7.0 | 11.4 | 18 | 5.6 | Ap | 0 | 3 | 0.30 | 1.4 | 1.9 | 11.0 | 47.1 | 15.0 | 4.9 | 18.7 | Ut4 | 7.19 |
| 33 | 4427194 | 5749687 | 143 | 7.3 | 16.5 | 26 | 6.9 | Ap | 0 | 3 | 0.20 | 1.1 | 1.4 | 7.2 | 51.4 | 15.1 | 6.6 | 17.2 | Ut4 | 7.41 |
| 34 | 4427095 | 5749673 | 144 | 7.4 | 13.7 | 18 | 8.8 | Ap | 0 | 3 | 0.20 | 0.8 | 1.1 | 5.2 | 50.7 | 14.5 | 6.8 | 20.9 | Ut4 | 7.42 |
| 35 | 4427179 | 5749506 | 145 | 7.3 | 12.2 | 19 | 6.4 | Ap | 0 | 3 | 0.20 | 0.3 | 0.9 | 7.6 | 50.3 | 16.5 | 7.7 | 16.7 | Ut3 | 7.46 |
| 36 | 4427373 | 5749355 | 146 | 7.1 | 11.0 | 24 | 7.5 | Ap | 0 | 3 | 11.10 | 2.8 | 2.8 | 16.8 | 30.4 | 9.3 | 8.5 | 29.4 | Lt2 | 7.31 |
| 37 | 4427351 | 5749188 | 147 | 6.3 | 4.1 | 19 | 8.3 | Ap | 0 | 3 | 0.70 | 1.3 | 1.7 | 7.3 | 48.4 | 14.3 | 7.6 | 19.4 | Ut4 | 6.51 |
| 38 | 4427197 | 5749195 | 148 | 5.8 | 6.7 | 25 | 7.9 | Ap | 0 | 3 | 5.00 | 2.7 | 2.9 | 13.6 | 43.7 | 11.7 | 7.0 | 18.4 | Lu | 4.93 |
| 39 | 4427135 | 5749339 | 149 | 7.2 | 7.7 | 29 | 6.2 | Ap | 0 | 3 | 15.90 | 7.8 | 5.7 | 14.6 | 26.0 | 12.2 | 9.0 | 24.7 | Ls2 | 7.43 |
| 40 | 4427010 | 5749504 | 150 | 7.3 | 11.3 | 17 | 8.8 | Ap | 0 | 3 | 0.20 | 0.4 | 0.7 | 8.1 | 46.4 | 18.6 | 9.6 | 16.2 | Ut3 | 7.33 |
| 41 | 4426918 | 5749618 | 151 | 7.2 | 13.7 | 21 | 10.2 | Ap | 0 | 3 | 0.10 | 0.7 | 1.3 | 5.7 | 47.6 | 18.7 | 6.2 | 19.8 | Ut4 | 7.32 |
| 42 | 4426723 | 5749512 | 152 | 7.3 | 15.8 | 18 | 6.8 | Ap | 0 | 3 | 1.60 | 1.0 | 0.9 | 10.7 | 47.4 | 15.2 | 6.2 | 18.6 | Ut4 | 7.44 |
| 43 | 4426881 | 5749397 | 153 | 7.3 | 13.5 | 18 | 11.3 | Ap | 0 | 3 | 0.10 | 0.3 | 1.0 | 7.3 | 60.1 | 10.4 | 5.6 | 15.3 | Ut3 | 7.44 |
| 44 | 4426941 | 5749277 | 154 | 7.2 | 15.0 | 28 | 7.7 | Ap | 0 | 3 | 1.10 | 1.7 | 2.0 | 22.0 | 40.2 | 12.4 | 6.5 | 15.2 | Uls | 7.16 |
| 45 | 4427011 | 5749134 | 155 | 6.8 | 11.0 | 23 | 8.5 | Ap | 0 | 3 | 1.20 | 1.3 | 1.5 | 15.5 | 40.9 | 15.1 | 6.6 | 19.1 | Lu | 6.44 |
| 46 | 4427115 | 5749038 | 156 | 6.4 | 21.8 | 29 | 8.4 | Ap | 0 | 3 | 9.20 | 2.5 | 2.8 | 15.6 | 42.6 | 15.1 | 6.6 | 14.8 | Uls | 6.37 |
| 47 | 4427237 | 5748957 | 157 | 5.6 | 6.4 | 20 | 8.2 | Ap | 0 | 3 | 0.50 | 0.5 | 2.5 | 8.5 | 44.5 | 17.8 | 7.5 | 18.7 | Ut4 | 5.01 |
| 48 | 4426845 | 5749062 | 158 | 7.1 | 11.4 | 18 | 8.7 | Ap | 0 | 3 | 13.80 | 1.1 | 1.2 | 40.8 | 28.7 | 7.8 | 4.7 | 15.7 | Slu | 7.39 |
| 49 | 4426762 | 5749297 | 159 | 7.2 | 14.6 | 22 | 7.1 | Ap | 0 | 3 | 0.10 | 0.4 | 0.9 | 7.4 | 49.5 | 15.3 | 7.5 | 19.0 | Ut4 | 7.41 |
| 50 | 4426567 | 5749453 | 160 | 7.2 | 19.1 | 20 | 8.1 | Ap | 0 | 3 | 1.20 | 0.7 | 0.9 | 10.3 | 54.6 | 11.9 | 8.1 | 13.5 | Ut3 | 7.09 |
| 51 | 4426632 | 5749287 | 161 | 7.1 | 12.1 | 17 | 10.8 | Ap | 0 | 3 | 0.10 | 0.4 | 1.0 | 8.8 | 48.4 | 14.9 | 8.1 | 18.4 | Ut4 | 7.18 |
| 52 | 4426676 | 5749108 | 162 | 7.3 | 14.2 | 18 | 7.5 | Ap | 0 | 3 | 5.00 | 1.3 | 2.1 | 18.2 | 43.7 | 12.3 | 4.6 | 17.8 | Lu | 7.41 |
| 53 | 4426481 | 5749154 | 163 | 7.3 | 18.2 | 17 | 8.6 | Ap | 0 | 3 | 1.60 | 1.4 | 1.6 | 19.6 | 36.1 | 12.7 | 4.8 | 23.8 | Lu | 7.43 |
| 54 | 4426426 | 5749333 | 164 | 7.3 | 25.5 | 20 | 8.0 | Ap | 0 | 3 | 1.00 | 1.4 | 1.2 | 9.7 | 52.4 | 12.3 | 5.5 | 17.5 | Ut4 | 7.44 |
| 55 | 4426255 | 5749196 | 165 | 6.9 | 14.5 | 17 | 8.5 | Ap | 0 | 3 | 0.60 | 1.1 | 1.4 | 14.5 | 47.5 | 14.6 | 5.0 | 15.9 | Ut3 | 6.06 |
| 56 | 4426093 | 5749162 | 166 | 7.2 | 21.2 | 19 | 6.2 | Ap | 0 | 3 | 1.30 | 1.3 | 1.8 | 15.1 | 50.0 | 12.5 | 4.4 | 14.9 | Ut3 | 7.31 |
Writing a data set (write.table)
write.table(soil_data, file = "output/this_is_a_copy_of_soil_data_made_with_R.csv", sep = ",",
row.names = FALSE, col.names = TRUE)
pH <- soil_data$pH
K <- soil_data$K
mean(pH)
## [1] 6.624561
median(pH)
## [1] 6.8
min(pH)
## [1] 5.2
max(pH)
## [1] 7.4
range(pH)
## [1] 5.2 7.4
sd(pH)
## [1] 0.6467724
summary(pH)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.200 6.200 6.800 6.625 7.200 7.400
some functions are not included in base R, but implemented via
packages. E.g. moments
library(moments)
skewness(pH)
## [1] -0.7153593
kurtosis(pH)
## [1] 2.315821
Pearson correlation coefficient cor()
\[{r_{xy}=\frac{\sum_{i = 1}^{n}(x_i-\overline{x})(y_{i}-\overline{y})}{\sqrt{\sum_{i = 1}^{n}(x_{i}-\overline{x})^2}{\sqrt{\sum_{i = 1}^{n}(y_{i}-\overline{y})^2}}}}\]
where:
and analogously for \({\bar {y}}\)
Examples:
So now with our data…
cor(pH, K)
## [1] 0.1480799
lm()in R, the linear models are written as
lm(y ~ x, data)
Example:
# loading some example data
pH <- c(7.25, 7.91, 5.43, 8.96, 3.21, 4.32, 6.45, 7.22, 9.1, 6.77, 8.43, 5.51, 7.5, 6.64, 4.55, 2.18, 9.79, 7.65)
Corg <- c(4.23, 3.22, 3.1, 7.39, 3.11, 2.37, 5.51, 5.01, 7.26, 6.4, 6.38, 3.37, 8.29, 6.93, 2.87, 0.12, 7.57, 9.4)
data <- as.data.frame(cbind(pH, Corg))
linearModel <- lm(pH ~ Corg , data = data)
summary(linearModel)
##
## Call:
## lm(formula = pH ~ Corg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0397 -0.9752 -0.1567 0.9637 2.5869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.1757 0.7173 4.427 0.000423 ***
## Corg 0.6669 0.1264 5.276 7.53e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.289 on 16 degrees of freedom
## Multiple R-squared: 0.635, Adjusted R-squared: 0.6122
## F-statistic: 27.84 on 1 and 16 DF, p-value: 7.535e-05
plot(linearModel)
When running this command, we obtain a set of 4 plots that are out of
the scope of this “introduction”, but for the curious ones:
This is just a visual check, not an air-tight proof, so it is somewhat subjective. But it allows us to see at-a-glance if our assumption is plausible, and if not, how the assumption is violated and what data points contribute to the violation.
cor()\[R^2≡1-\frac{\sum(y_{i}-\hat{y_{i}})^2}{\sum(y_{i}-\overline{y})^2}\]
\[R^2=r×r\]
cor(data$pH, linearModel$fitted.values) *
cor(data$pH, linearModel$fitted.values)
## [1] 0.6349996
\[RMSE=\sqrt{\frac{1}{n}\sum_{i = 1}^{n}{(\hat{y_{i}}-y_{i})^2}}\]
sqrt(1 / length(rbind(linearModel$residuals * sum(rbind(linearModel$residuals) ^ 2))))
## [1] 0.2357023
Base R does not include a function for RMSE in base R, but in R we can create custom functions as follows:
function_name <- function(input_x, input_y, ...) {
some_operations
}
rmse <- function(x, y) {
sqrt(mean((x - y) ^ 2))
}
Requires input
rmse(linearModel$fitted.values, Corg)
## [1] 1.668019
boxplot()boxplot(pH, Corg, names = c("pH value", "Corg content"), ylab = "pH value and Corg content")
What does the boxplot show?
hist()hist(pH,
xlab = "pH value",
ylab = "Frequency",
main = ""
)
plot(density()plot(density(pH),
xlab = "pH value",
ylab = "Frequency",
main = ""
)
plot(x, y)plot(pH,
Corg,
xlab = "pH value",
ylab = "Corg content"
)
Directly in the graphic routines (help with ?par)
col = ...pch = ..., sizes with
cex = ...main = ..., axis label with
xlab = ..., ylab = ...xlim = ...,
ylim = ...After drawing a graphic
lines(...) or
points(...) respectively.text(...) or
mtext(...)title(...)legend(...)Output adjustment
main Title of the diagramxlab labeling of the x-axisylab labeling of the y-axisbreaks frequency classes number of barscol color filling of the barscex gradual scaling of text sizehist(pH,
xlab = "pH value",
ylab = "Frequency",
main = "",
breaks = seq(0, 10, 2),
col = "red",
cex = 0.5
)
RomeWhen working with code, it is very common that there are several ways to solve the same problem.
Exercise: Frequency distribution grain size types KA4
And now for the frequency distribution grain size types KA4!
Hint: The three different version are made with a
barplot(), plot() and hist()
If you can’t solve it by yourself, you can look at the code and try to understand it, but I encourage you to try.
# Variant A
soil_type <- as.data.frame(table(soil_data$soil_type_ka4))
barplot(soil_type$Freq, space = 0, names = soil_type$Var1,
xlab = "Soil type", ylab = "Frequency",
main = "Frequency diagram KA4 Soil types")
# Variant B
plot(factor(soil_data$soil_type_ka4),
xlab = "Soil type",
ylab = "Frequency",
main = "Frequency diagram KA4 Soil types")
# Variant C
soil_type <- as.data.frame(table(soil_data$soil_type_ka4))
hist(as.numeric(factor(soil_data$soil_type_ka4)),
breaks = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9),
freq=TRUE,
xaxt = "n",
xlab = "Soil type",
ylab = "Frequency",
main = "Frequency diagram KA4 Soil types")
axis(side = 1, at = seq(0.5, 8.5, 1), labels = soil_type$Var1)
When you set graphical parameters in Rstudio using this method, it is important to set the properties, plot the figures and reset the properties to the way they were before. Otherwise, every time we plot something, it will follow the set parameters.
par( # set or query graphical parameters
mfrow = c(1, 3), # 1 x 3 pictures on one plot, equivalent to mfcol = c(3, 1)
mar = c(5.1, 4.1, 4.1, 2.1), # margins as c(bottom, left, top, right)
oma = c(0, 0, 0, 0), # outer margins in lines of text as c(bottom, left, top, right)
mgp = c(3, .1, 0), # margins line for axis title, axis label and axis line
las = 0, # label axis style
cex.lab = 1, # size of the labels
cex.axis = 1, # size of the axis annotation
xpd = FALSE # If FALSE, all plotting is clipped to the plot region, if TRUE, all plotting is clipped to the figure region, and if NA, all plotting is clipped to the device region.
)
plot(…); plot(…); plot(…)
par(mfrow = c(1, 1),
mar = c(5.1, 4.1, 4.1, 2.1),
oma = c(0, 0, 0, 0),
mgp = c(3, 1, 0),
las = 0,
cex.lab = 1,
cex.axis = 1,
)
In R it is possible to export images in the most common formats and can be extended from packages. The output format must be specified and the file will be generated by default in our working directory.
tiff("filename.tiff", width = 21, height = 8, units = "cm", res = 300)
par(…)
plot(…); plot(…); plot(…)
par(…)
dev.off()
No! R and Rstudio provide several tools to help us do this:
R
will ignore all the text that start with #. Is a good
practice to comment our code, is very useful if somebody else need to
understand your code or for yourself in the future. In Rstudio, you can
comment/uncomment the selected text with Ctrl+⇧Shift+CTab ↹, and it will display a list with the possible
parameters for that function. move up ↑ and down
↓ and select with ↵ Return.F1, write ?function,
help(function) or help('function') in the
Rstudio console, and it will display the corresponding documentation for
that function in the help panel.Many functions are not included in the basic version of R. Therefore, there is an almost confusing variety of additional libraries for special applications. Examples:
ggplot2 and lattice for advanced
graphicsdplyr, reshape2 and tidyr for
data manipulationsp and sf for spatial data (shapefiles,
geopackage, etc.)raster, terra, stars for
spatial raster datacaret as wrapping function for machine-learning
librariesRQGIS and RSAGA as bridges to QGIS and
SAGA GISIn the console or the script
install.packages(ggplot2) # install a library
library(ggplot2) # loading a library
require(ggplot2)
detach(ggplot2) # Removing a loaded library
rp <- c("DescTools", "rstudioapi") # required packages
ip <- rp[!(rp %in% installed.packages()[, "Package"])] # subset packages
if(length(ip)) install.packages(ip, dependencies = TRUE) # install packages
lapply(rp, require, character.only = TRUE) # load packages
rm(np, rp) # remove vectos
object <- read.table("file.txt", sep = "\t", dec = ".", header = TRUE)
write.table(object, "file.txt", sep = "\t", dec = ".", col.names = TRUE, row.names = FALSE, append = FALSE)
setwd(dirname(rstudioapi::getSourceEditorContext()$path))
Let’s start to explore the data with the tools we already know. In this case, we will use the variable P as example.
plot(soil_data$X_coord,
soil_data$Y_coord,
cex=soil_data$P*0.05)
Model: Limited representation of reality
Spatial modeling: Image of states or processes of locatable objects and their characteristic values
Data for spatial modeling:
library(sp)
soil_data_copy <- soil_data
coordinates(soil_data_copy) = ~X_coord + Y_coord
bubble(soil_data_copy,"pH")
library(gstat)
# Determination of the extension (xmin, xmax, ymin, ymax)
xmin <- min(soil_data$X_coord) - 100
xmax <- max(soil_data$X_coord) + 100
ymin <- min(soil_data$Y_coord) - 100
ymax <- max(soil_data$Y_coord) + 100
# Determination of the resolution (cellsize)
cellsize = 10
# Spanning a grid (raster format) for interpolation
grd<- expand.grid(x=seq(from= xmin, to= xmax, by= cellsize), y=seq(from= ymin, to= ymax, by= cellsize))
coordinates(grd) <- ~ x+y
gridded(grd) <- TRUE
Gstat library has for this the function
dataset.idw <- idw(soil_data$pH ~ 1, location = ~X_coord+Y_coord, soil_data, grd)
## [inverse distance weighted interpolation]
image(dataset.idw["var1.pred"],col=topo.colors(20))
points(soil_data$X_coord, soil_data$Y_coord, col ="blue")
dataset.idw<- idw(soil_data$pH ~ 1, location = ~X_coord+Y_coord, soil_data, grd)
## [inverse distance weighted interpolation]
image(dataset.idw["var1.pred"],col=topo.colors(20))
points(soil_data$X_coord, soil_data$Y_coord, col ="blue")
library(tripack)
#Define spatial objects
coordinates(soil_data) = c("X_coord", "Y_coord")
# Select the coordinate columns
cc = coordinates(soil_data)
# Overlay with IDW interpolation
plot(voronoi.mosaic(cc[, 1], cc[, 2]), do.points = FALSE, add = TRUE)
title("Thiessen (orVoronoi) polygon interpolation of pH-Value")
# Variogram
v <- variogram(pH ~ 1, locations = coordinates, data = soil_data, width = cellsize * 0.5)
# Variogram fit
v.fit <- fit.variogram(v, fit.method = TRUE, model = vgm(0.1, "Gau", 600))
# Output of the variogram
plot(v, model = v.fit)
krige() function.soil_data <- read.table("data/soil_data.txt", header = TRUE)
# ordinary kriging:
z <- krige(formula = pH ~ 1, locations = ~ X_coord + Y_coord, data = soil_data, newdata = grd, model = v.fit, nmax = 500)
## [using ordinary kriging]
image(z, col = topo.colors(20))
points(soil_data$X_coord, soil_data$Y_coord)
title("Ordinary kriging Interpolation of pH-Value")
# simple kriging:
z <- krige(pH ~ 1, locations = ~ X_coord + Y_coord, data = soil_data, newdata = grd, model = v.fit, nmax = 500, beta = mean(soil_data$pH))
## [using simple kriging]
image(z, col = topo.colors(20))
points(soil_data$X_coord, soil_data$Y_coord)
title("Simple kriging Interpolation of pH-Value")
Exercise:
Unlike ordinary kriging, in regression kriging the trend is no longer constant but a function of ’explanatory’ variables, for example:
\[\text{soil depth(x)}=\beta_0 + \beta_1\cdot\text{elevation(x)}+\beta_2\cdot\text{slope angle(x)}+\beta_3\cdot\text{vegetation density(x)}+\text{residual(x)}\]
soil_data <- read.table("data/soil_data.txt", header = TRUE)
# make spatial
class(soil_data)
## [1] "data.frame"
coordinates(soil_data) <- ~X_coord+Y_coord
class(soil_data)
## [1] "SpatialPointsDataFrame"
## attr(,"package")
## [1] "sp"
# fit a linear regression model and inspect the results
soil_data.lm <- lm(pH ~ P + coarse_sand, data = soil_data)
summary(soil_data.lm)
##
## Call:
## lm(formula = pH ~ P + coarse_sand, data = soil_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3671 -0.4684 0.1471 0.5299 0.7702
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.389603 0.133280 47.941 <2e-16 ***
## P 0.017871 0.007007 2.551 0.0136 *
## coarse_sand -0.005750 0.015170 -0.379 0.7062
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6222 on 54 degrees of freedom
## Multiple R-squared: 0.1075, Adjusted R-squared: 0.07446
## F-statistic: 3.253 on 2 and 54 DF, p-value: 0.04636
# append residuals to dataset
soil_data$residuals <- soil_data.lm$residuals
# define gstat object and compute experimental semivariogram
v.soil_gstat <- variogram(residuals~1, locations = soil_data, width = cellsize * 0.5)
v.soil_gstat
## np dist gamma dir.hor dir.ver id
## 1 1 99.9850 0.010998607 0 0 var1
## 2 2 107.7771 0.046114770 0 0 var1
## 3 1 112.5389 0.004214714 0 0 var1
## 4 1 118.4061 0.003586350 0 0 var1
## 5 1 121.4907 0.006136650 0 0 var1
## 6 4 132.1510 0.009999453 0 0 var1
## 7 4 137.3698 0.574491451 0 0 var1
## 8 6 143.1705 0.149921248 0 0 var1
## 9 5 147.0303 0.075935155 0 0 var1
## 10 2 153.4994 0.075790474 0 0 var1
## 11 7 157.4933 0.170352210 0 0 var1
## 12 2 162.2325 0.079053295 0 0 var1
## 13 7 167.1980 0.061759283 0 0 var1
## 14 2 172.2230 0.061463110 0 0 var1
## 15 7 177.5775 0.100320527 0 0 var1
## 16 11 182.1261 0.223024995 0 0 var1
## 17 8 187.2538 0.037000118 0 0 var1
## 18 4 192.5410 0.087470531 0 0 var1
## 19 4 197.0991 0.108876737 0 0 var1
## 20 5 202.7742 0.015618195 0 0 var1
## 21 8 208.1228 0.009038054 0 0 var1
## 22 4 212.7310 0.016819041 0 0 var1
## 23 3 218.2137 0.057450361 0 0 var1
## 24 7 222.1714 0.162670464 0 0 var1
## 25 5 227.5137 0.076349132 0 0 var1
## 26 1 230.5407 0.307771526 0 0 var1
## 27 10 237.5216 0.115916739 0 0 var1
## 28 4 242.2600 0.029486373 0 0 var1
## 29 8 247.2747 0.397063672 0 0 var1
## 30 2 252.6380 0.090376044 0 0 var1
## 31 5 258.3040 0.150960533 0 0 var1
## 32 5 262.6909 0.168113473 0 0 var1
## 33 3 268.0358 0.340586708 0 0 var1
## 34 5 272.1170 0.085448716 0 0 var1
## 35 6 278.1768 0.092900111 0 0 var1
## 36 6 283.2348 0.547249599 0 0 var1
## 37 8 287.2386 0.110861964 0 0 var1
## 38 7 292.7238 0.148818625 0 0 var1
## 39 6 297.2684 0.247600343 0 0 var1
## 40 4 301.7825 0.238142252 0 0 var1
## 41 4 308.8991 0.108845843 0 0 var1
## 42 6 311.8783 0.256073285 0 0 var1
## 43 8 317.9102 0.058779001 0 0 var1
## 44 10 322.3305 0.178677276 0 0 var1
## 45 3 328.5462 0.089396370 0 0 var1
## 46 7 331.6169 0.328627668 0 0 var1
## 47 6 337.0331 0.227776124 0 0 var1
## 48 5 343.0131 0.090012462 0 0 var1
## 49 5 347.5278 0.024428773 0 0 var1
## 50 10 352.7061 0.161957368 0 0 var1
## 51 5 357.4305 0.236790496 0 0 var1
## 52 9 362.6623 0.478512469 0 0 var1
## 53 8 368.1542 0.385785273 0 0 var1
## 54 5 372.9219 0.081201901 0 0 var1
## 55 9 376.5435 0.467590068 0 0 var1
## 56 5 383.1718 0.168616348 0 0 var1
## 57 8 388.4596 0.109943087 0 0 var1
## 58 8 392.6046 0.222274419 0 0 var1
## 59 6 396.8480 0.500231741 0 0 var1
## 60 8 402.7471 0.106241609 0 0 var1
## 61 8 407.1071 0.330509476 0 0 var1
## 62 8 412.5329 0.338562353 0 0 var1
## 63 10 418.0811 0.247000869 0 0 var1
## 64 7 422.4028 0.214956796 0 0 var1
## 65 6 427.5669 0.324089705 0 0 var1
## 66 7 432.1653 0.071761465 0 0 var1
## 67 9 436.6748 0.340504111 0 0 var1
## 68 5 442.5669 0.381615203 0 0 var1
## 69 10 447.9000 0.282259610 0 0 var1
## 70 5 453.3333 0.122436045 0 0 var1
## 71 4 457.8291 0.254807809 0 0 var1
## 72 10 462.8651 0.328102541 0 0 var1
## 73 8 466.8619 0.271779320 0 0 var1
## 74 4 472.2789 0.240494643 0 0 var1
## 75 10 477.0135 0.211068023 0 0 var1
## 76 5 482.2978 0.210653221 0 0 var1
## 77 7 487.3286 0.229633051 0 0 var1
## 78 11 492.5443 0.216835547 0 0 var1
## 79 4 497.7312 0.165298562 0 0 var1
## 80 7 502.5239 0.218787862 0 0 var1
## 81 13 506.6728 0.275003571 0 0 var1
## 82 10 513.5812 0.231679215 0 0 var1
## 83 7 517.0992 0.018404696 0 0 var1
## 84 7 521.6196 0.288641855 0 0 var1
## 85 6 527.3649 0.334282406 0 0 var1
## 86 8 531.7579 0.220623508 0 0 var1
## 87 5 538.4272 0.091338205 0 0 var1
## 88 6 541.8270 0.257255649 0 0 var1
## 89 9 547.5605 0.236774427 0 0 var1
## 90 10 552.1197 0.497280408 0 0 var1
## 91 6 556.8588 0.314068204 0 0 var1
## 92 8 562.5876 0.215222135 0 0 var1
## 93 7 567.7268 0.411711563 0 0 var1
## 94 11 572.8670 0.329016847 0 0 var1
## 95 10 578.0371 0.151259843 0 0 var1
## 96 10 582.6076 0.418255512 0 0 var1
## 97 7 587.8687 0.348029260 0 0 var1
## 98 4 593.5089 0.594013817 0 0 var1
## 99 4 597.8710 0.165097805 0 0 var1
## 100 7 602.4720 0.365239764 0 0 var1
## 101 11 608.0501 0.305591399 0 0 var1
## 102 11 611.9818 0.295710640 0 0 var1
## 103 8 617.3373 0.565019553 0 0 var1
## 104 2 623.6784 0.277962770 0 0 var1
## 105 7 627.5041 0.133425370 0 0 var1
## 106 8 632.8191 0.096482901 0 0 var1
## 107 7 637.3942 0.462755686 0 0 var1
## 108 8 642.4652 0.375255043 0 0 var1
## 109 8 648.6220 0.292981075 0 0 var1
## 110 9 652.3068 0.110480171 0 0 var1
## 111 6 657.0988 0.136732555 0 0 var1
## 112 10 660.9686 0.375934789 0 0 var1
## 113 6 668.5883 0.144723223 0 0 var1
## 114 6 672.7600 0.367456153 0 0 var1
## 115 9 678.2146 0.221570354 0 0 var1
## 116 13 682.8694 0.259129720 0 0 var1
## 117 12 688.4469 0.262266095 0 0 var1
## 118 8 692.3557 0.418412674 0 0 var1
## 119 5 698.0749 0.263215204 0 0 var1
## 120 5 702.4682 0.278038380 0 0 var1
## 121 7 707.6613 0.539628511 0 0 var1
## 122 3 713.0274 0.613283150 0 0 var1
## 123 8 716.7852 0.202098715 0 0 var1
## 124 5 721.5097 0.333817320 0 0 var1
## 125 7 727.9157 0.406522730 0 0 var1
## 126 8 732.4659 0.586090914 0 0 var1
## 127 12 738.7178 0.334040806 0 0 var1
## 128 6 742.4546 0.115571309 0 0 var1
## 129 4 748.0144 0.428347500 0 0 var1
## 130 14 752.2594 0.302066761 0 0 var1
## 131 10 757.7124 0.414623345 0 0 var1
## 132 6 763.0440 0.228175655 0 0 var1
## 133 5 767.9625 0.410167169 0 0 var1
## 134 10 773.0960 0.324050990 0 0 var1
## 135 7 776.8974 0.257897803 0 0 var1
plot(v.soil_gstat, plot.nu=TRUE)
# fit semivariogram model
fv.soil_gstat <- fit.variogram(v.soil_gstat, fit.method = TRUE, model = vgm(300, "Exp", 0.1))
## Warning in fit.variogram(v.soil_gstat, fit.method = TRUE, model = vgm(300, :
## singular model in variogram fit
# Output of the variogram
plot(v.soil_gstat, model = fv.soil_gstat, cutoff = 800)
# check if the residuals approximately follow a normal distribution
hist(soil_data$residuals)
# here they do so there is no need for a transformation
soil_data <- read.table("data/soil_data.txt", header = TRUE)
x <- krige.cv(pH ~ 1, ~ X_coord+Y_coord, data = soil_data, maxdist = 500, nfold = 10)
cor(x$observed, x$var1.pred) * cor(x$observed, x$var1.pred) # Rsquared
## [1] 0.6039993
sqr_error <- (x$var1.pred - x$observed )^2
sqrt(sum(sqr_error)/length(sqr_error)) # RMSE
## [1] 0.4072826
This is a simple example about how to built a decision tree. For this
example, we will use the package rpart
library(rpart)
soil_model <- rpart(pH ~ P + coarse_sand, data = soil_data, method = "anova", control = rpart.control(cp = 0))
The structure of a decision/classification trees can be depicted visually, which helps to understand how the tree makes its decisions.
# Load the rpart.plot package
library(rpart.plot)
# Plot the soil_model with default settings
rpart.plot(soil_model)
A classification tree grows using a divide-and-conquer process. Each time the tree grows larger, it splits groups of data into smaller subgroups, creating new branches in the tree. This process always looks to create the split resulting in the greatest improvement to purity (subgroup homogeneity).
Classification trees tend to grow easily due to they divide the data using one variable value, producing the most pure partition each time:
Before building a more sophisticated pH model, it is important to hold out a portion of the data to simulate how well it will predict the pH of unknown data points.
As depicted in the following image, you can use 75% of the observations for training and 25% for testing the model.
The sample() function can be used to generate a random
sample of rows to include in the training set. Simply supply it the
total number of observations and the number needed for training. Then we
can use the resulting vector of row IDs to subset the samples into
training and testing datasets as:
# Determine the number of rows for training
nrow(soil_data)
## [1] 57
nrow(soil_data) * 0.75
## [1] 42.75
# Create a random sample of row IDs
sample_rows <- sample(nrow(soil_data), nrow(soil_data) * 0.75)
# Create the training dataset
soil_train <- soil_data[sample_rows, ]
# Create the test dataset
soil_test <- soil_data[-sample_rows, ]
str(soil_train)
## 'data.frame': 42 obs. of 21 variables:
## $ ID : int 14 37 8 27 23 47 16 17 12 33 ...
## $ X_coord : int 4427807 4427351 4428086 4427660 4427560 4427237 4427579 4427668 4427950 4427194 ...
## $ Y_coord : int 5749625 5749188 5748785 5748900 5749481 5748957 5749612 5749765 5749243 5749687 ...
## $ Id : int 124 147 118 137 133 157 126 127 122 143 ...
## $ pH : num 6.4 6.3 5.2 6.5 7 5.6 6.9 6.7 5.5 7.3 ...
## $ P : num 7.9 4.1 12.6 6.1 20.5 6.4 21.1 17 7.1 16.5 ...
## $ K : num 22 19 17 19 18 20 25 24 21 26 ...
## $ Mg : num 9.5 8.3 5.8 10.4 4.1 8.2 5.1 6.6 7 6.9 ...
## $ horiz : chr "Ap" "Ap" "Ap" "Ap" ...
## $ upper_depth : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lower_depth : int 3 3 3 3 3 3 3 3 3 3 ...
## $ stones : num 0.6 0.7 26.2 0.3 6.6 0.5 8.7 12.7 33.9 0.2 ...
## $ coarse_sand : num 1.1 1.3 8.3 0.5 21.2 0.5 2.4 5.7 7.3 1.1 ...
## $ medium_sand : num 2.2 1.7 8.7 0.8 6.9 2.5 3 7.3 8.8 1.4 ...
## $ fine_sand : num 4.3 7.3 8.9 5.2 40.2 8.5 7.3 7.7 7.3 7.2 ...
## $ coarse_silt : num 50.6 48.4 43.9 52.5 14.8 44.5 44.6 42.9 44.3 51.4 ...
## $ medium_silt : num 13.7 14.3 10 14.6 2.9 17.8 12.5 10.3 12.5 15.1 ...
## $ fine_silt : num 6 7.6 6.2 7.5 3.1 7.5 7.6 4 4.3 6.6 ...
## $ clay : num 22.1 19.4 14 18.9 10.9 18.7 22.6 22.1 15.5 17.2 ...
## $ soil_type_ka4: chr "Ut4" "Ut4" "Uls" "Ut4" ...
## $ pH_cacl2 : num 6.29 6.51 5.21 6.8 6.78 5.01 7.05 7.1 5.86 7.41 ...
str(soil_test)
## 'data.frame': 15 obs. of 21 variables:
## $ ID : int 1 5 6 13 19 20 24 28 36 39 ...
## $ X_coord : int 4427932 4428105 4428122 4427929 4427563 4427461 4427704 4427849 4427373 4427135 ...
## $ Y_coord : int 5749890 5749189 5749044 5749496 5749885 5749819 5749249 5748858 5749355 5749339 ...
## $ Id : int 111 115 116 123 129 130 134 138 146 149 ...
## $ pH : num 7 5.5 5.5 6 7.1 7 6.7 7.2 7.1 7.2 ...
## $ P : num 88 4.4 6.4 5.1 23.6 24.3 11.1 8.5 11 7.7 ...
## $ K : num 23 19 15 19 29 32 20 16 24 29 ...
## $ Mg : num 7.4 6.9 6 7.6 5.9 7.3 7 7.5 7.5 6.2 ...
## $ horiz : chr "Ap" "Ap" "Ap" "Ap" ...
## $ upper_depth : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lower_depth : int 3 3 3 3 3 3 3 3 3 3 ...
## $ stones : num 3.4 0.4 1.4 0.2 3.3 0.9 32.8 0.2 11.1 15.9 ...
## $ coarse_sand : num 3.2 0.8 2 0.7 2.9 3.2 22.1 0.4 2.8 7.8 ...
## $ medium_sand : num 14.7 1.5 2.9 1.5 3.8 3.1 12.2 1 2.8 5.7 ...
## $ fine_sand : num 6.9 2.8 3.7 2.6 5 13.3 16.8 6.7 16.8 14.6 ...
## $ coarse_silt : num 42.1 52.8 51.5 52 47.6 42.5 23.3 46.1 30.4 26 ...
## $ medium_silt : num 10.6 15.2 14.8 15.7 11.6 16.2 8.7 17.5 9.3 12.2 ...
## $ fine_silt : num 5.9 6.6 6.6 5.8 5.1 5 6.9 8.6 8.5 9 ...
## $ clay : num 16.6 20.3 18.5 21.7 24 16.7 10 19.7 29.4 24.7 ...
## $ soil_type_ka4: chr "Uls" "Ut4" "Ut4" "Ut4" ...
## $ pH_cacl2 : num 7.26 6.3 6.46 6.04 7.36 7.02 6.35 7.4 7.31 7.43 ...